*SM-E21
set scheme stcolor
version 17


********************************************************************************		
* STUDY LEVEL ANALYSIS
********************************************************************************
		
use "../Metadata/tess_analysisdata.dta", clear

* % hypotheses supported 
	tempvar temp
	cap drop total_hyp_true
	
	egen total_hyp_true=sum(hyp_true_insample), by(vendor_id)
	gen `temp'=  total_hyp_true/total_hyp_insample
	egen pctsupported= max(`temp'), by(vendor_id)
	
	replace successfulexp_insample = pctsupported
	replace successfulexp_insample = 0 if hyp_true_insample==0

* make variables	
		recode title (1 2 6=0) (3/5=1), gen(faculty)
		lab var faculty "Faculty PI"

		gen polsci = (discipline == 5)
		lab var polsci "Political science"

		recode numauthors (1=0) (2/10=1), gen(authors2)
		lab var authors2 "Co-authored"

		recode check_type (0=0) (1/3=1), gen(checks)
		lab var checks "Any checks"
	

		* relabel var for length
			lab var type_framing "Framing exp."
			lab var type_priming "Priming exp."
			lab var type_vignette "Vignette exp."
			lab var type_questwording "Question-wording exp."
			lab var type_information "Information provision exp."
			lab var type_beliefelicit "Belief elicitation exp."
			lab var type_writing "Writing treatment"
			lab var type_conj_dce "Conjoint/Discrete choice exp."
			
		* average level of hyp-level vars
			* at least one binary DV
			recode outcome_categories (2=1) (3/8=0), gen(bin)
			egen binarydv= mean(bin), by(vendor_id)
			lab var binarydv "Binary DV (at least 1)"

			recode outcome_categories (2/7=0) (8=1), gen(cont)
			egen continuousdv= mean(cont), by(vendor_id)
			lab var continuousdv "Continuous DV (at least 1)"				
			
			
			* number of items used to measure DV, on average across the study
			egen avgnummeasures= mean(num_measures), by(vendor_id)
			forval i=1/5 {
			lab var avgnummeasures "Avg. # of items per DV"
			}
			
			* whether the study had any of these types of hypotheses
			tab hyp_type, gen(hypcat)
			forval i=1/5 {	
			egen  hypcatany`i'=max(hypcat`i'), by(vendor_id)	
			}
			lab var hypcatany1 "Default treat. comp. (at least 1)"
			lab var hypcatany2 "Moderation test (at least 1)"
			lab var hypcatany3 "Mediation test (at least 1)"
			lab var hypcatany4 "t-test (at least 1)"
			lab var hypcatany5 "KS-test (at least 1)"	

			lab var medtime15 "Median duration (mins.)"
			
			
		* study-level analysis	
		keep if hyp_num==1	
	
* set seed
		set seed 10101
		generate u = runiform()
		sort u

		* how many variables at a time? (using 400 iterations)
		gen outofbagerror2=.
		gen validationerror2=.
		gen nvars=.

		local j=0
		forval i=1(1)25 {
			local j=`j'+1
			
		rforest successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 ///
			in 1/60, ///
			 iter(400) type(class) numvars(`i')
			 
			 
			 qui replace nvars= `i' in `j'
			 qui replace outofbagerror2= `e(OOB_Error)' in `j'
			 predict p in 61/100
			 qui replace validationerror2 = `e(error_rate)' in `j'
			 drop p
		}


			label var outofbagerror2 "Out of Bag Error"
			label var nvars "No. of variables"
			label var validationerror2 "Validation Error"
			scatter outofbagerror2 nvars, mcolor(blue) msize(tiny) || ///
			scatter validationerror2 nvars, mcolor(red) msize(tiny)


		// assess lowest validation error rate	
		cap frame drop mydata 
		frame put validationerror2 outofbagerror2 nvars, into(mydata)  
		frame mydata { 
			gen totalerror=validationerror2 +outofbagerror2
			sort totalerror, stable 
			local min_val_err = validationerror2[1] 
			local min_nvars = nvars[1] 
			} 

		di "Minimum error ="  `min_val_err' 
		di "Corresponding number of variables =" `min_nvars' 

			
		cap frame drop mydata 

* estimate model
		rforest successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 ///
			, ///
			iter(400) type(class) numvars(1)
		di e(OOB_Error) 
		cap drop p
		predict p
		di e(error_rate) // there is no error, because all data is used



* variable importance matrix
		matrix importance = e(importance)
		matrix list importance

		// create a variable with importance scores from matrix
		cap drop importance
		svmat importance

		// create a variable with row names from matrix
		cap drop importid
		generate importid= ""
		local names: rownames importance 
		local k: word count `names'
		if `k'>_N {
			set obs `k'
		}
		forval i=1(1)`k' {
			local aword: word `i' of `names'
			local alabel: var label `aword'
			if ("`alabel'"!="") qui replace importid = "`alabel'" in `i'
			else qui replace importid= "`aword'" in `i'
		}


// to add directional arrows: create a variable with correlations of var with success
		correl successfulexp_insample ///
			samplesize ///
			medtime15  ///
			type_beliefelicit type_conj_dce type_framing type_information ///
			type_priming type_questwording type_vignette type_writing ///
			hypcatany* ///
			woman ///
			faculty polsci shortstudy ///
			pilot clarity ///
			checks ///
			binarydv continuousdv avgnummeasures ///
			authors2 
		

			matrix corr = r(C)
			matrix list corr
			matselrc corr corr1, r(2/27) c(1/1)
			matrix list corr1

			// create a new variable with correlations from matrix
			cap drop corr1
			svmat corr1

			cap drop direction
			gen direction=""
			replace direction=" ({&uarr})" if corr11>0 & corr11!=.
			replace direction=" ({&darr})" if corr11<0 & corr11!=.

			// concat importid and direction
			cap drop labels
			egen labels = concat(importid direction)



* Importance score chart
		graph hbar (mean) importance1, ///
		over(labels, sort(1) descending label(labsize(3.3)) axis(noline)) ///
		bar(1, fcolor(stc2*.6) lc(none)) ///
		ytitle(importance1) ///
		ytitle("Variable importance score", size(3.2) margin(l=-8)) ///
		ylabel(, labsize(3.3) nogrid) ///
		graphregion(color(white)) scheme(stcolor) ///
		title("{it: Prop. of Supported Hypotheses at Study Level}", span size(3.5)) ///
		name(RFstudy, replace) ///
		xsize(3.5)
		

graph export "../Results/SM-E21-Figure-RandomForest_propsupport.pdf", replace
